/*******************************************************************************
* Copyright 2019 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "common_f32.hpp"
#include "jit_generator.hpp"

namespace mkldnn {
namespace impl {
namespace cpu {

// Emits (via Xbyak) the machine code of an f32 copy/pack kernel.
//
// The generated routine receives six pointer arguments, aliased below as
// M, N, A, LDA, ALPHA and B.  M, N and LDA are dereferenced as 64-bit values,
// ALPHA as a single 32-bit float, A is the source and B the destination.
//
// The source is read as N strided vectors (stride LDA elements of 4 bytes),
// each holding M consecutive floats.  The vectors are consumed in panels of
// 4, then 2, then 1; inside a panel the elements with the same index are
// stored next to each other in B, so a panel of width w produces M groups of
// w floats.  Every stored value is scaled by alpha, with three specialized
// code paths:
//   * alpha ==  1.0f : plain copy,
//   * alpha == -1.0f : sign flip through an XOR with the sign-bit mask,
//   * otherwise      : multiplication by the broadcast alpha.
jit_avx2_f32_copy_bn_kern::jit_avx2_f32_copy_bn_kern() :
    jit_generator(nullptr, F32_COPY_KERNEL_CODE_SIZE) {

// Register aliases.  They are plain macros scoped to this constructor and
// are #undef'ed again at its end.
#ifndef _WIN32
// Non-Windows build: the six arguments are taken from rdi, rsi, rdx, rcx,
// r8 and r9.
#define M   rdi
#define N   rsi
#define A   rdx
#define LDA rcx
#define ALPHA   r8
#define B   r9

// Scratch registers.  A2 shares r8 with ALPHA; ALPHA is only read once
// (vbroadcastss below) before A2 is first written.
#define I   rax
#define A1  r10
#define A2  r8
#define LDA3    r11

#else
// Windows build: the first four arguments are taken from rcx, rdx, r8 and
// r9; ALPHA and B are loaded from the stack into rsi and rdi after the
// preamble.  A1 shares rsi with ALPHA, which is consumed before A1 is
// first written.
#define M   rcx
#define N   rdx
#define A   r8
#define LDA r9
#define ALPHA   rsi
#define B   rdi
#define I   rax
#define A1  rsi
#define A2  r10
#define LDA3    r11

// Stack addresses of the fifth and sixth arguments; `stacksize` is the
// number of bytes reported by get_size_of_abi_save_regs() (see below).
#define ARG_ALPHA   40+stacksize+rsp
#define ARG_B       48+stacksize+rsp

#endif

inLocalLabel();
{

// Jump targets of the generated code (bound with L(...) further down).
Xbyak::Label l118;
Xbyak::Label l15c;
Xbyak::Label l16c;
Xbyak::Label l18c;
Xbyak::Label l1cc;
Xbyak::Label l200;
Xbyak::Label l230;
Xbyak::Label l234;
Xbyak::Label l254;
Xbyak::Label l294;
Xbyak::Label l2c4;
Xbyak::Label l2e8;
Xbyak::Label l2ec;
Xbyak::Label l2f4;
Xbyak::Label l310;
Xbyak::Label l328;
Xbyak::Label l398;
Xbyak::Label l3ec;
Xbyak::Label l434;
Xbyak::Label l444;
Xbyak::Label l464;
Xbyak::Label l4ac;
Xbyak::Label l4e4;
Xbyak::Label l518;
Xbyak::Label l51c;
Xbyak::Label l53c;
Xbyak::Label l54;
Xbyak::Label l580;
Xbyak::Label l5b4;
Xbyak::Label l5dc;
Xbyak::Label l5e0;
Xbyak::Label l5e8;
Xbyak::Label l5f4;
Xbyak::Label l60c;
Xbyak::Label l67c;
Xbyak::Label l6c;
Xbyak::Label l6d0;
Xbyak::Label l718;
Xbyak::Label l728;
Xbyak::Label l748;
Xbyak::Label l790;
Xbyak::Label l7c8;
Xbyak::Label l7fc;
Xbyak::Label l800;
Xbyak::Label l820;
Xbyak::Label l864;
Xbyak::Label l898;
Xbyak::Label l8c0;
Xbyak::Label l8c4;
Xbyak::Label lcc;

    preamble();
#ifdef _WIN32
    // Fetch the two stack-passed arguments, skipping the bytes that
    // preamble() placed on the stack.
    auto stacksize = get_size_of_abi_save_regs();
    mov(ALPHA, ptr[ARG_ALPHA]);
    mov(B, ptr[ARG_B]);
#endif

    // M, N and LDA arrive as pointers; replace them with the pointed-to
    // 64-bit values.
    mov(M, qword[M]);
    mov(N, qword[N]);
    mov(LDA, qword[LDA]);
    // A is used without bias (subtracting 0 leaves it unchanged); B is
    // advanced by 128 so that all stores below address it as B-0x80...
    sub(A, 0x0);
    sub(B, -128);
    // Convert LDA from elements to bytes (x4) and precompute 3*LDA.
    shl(LDA, 0x2);
    lea(LDA3, ptr[LDA+LDA*2]);
    // ymm6 = alpha replicated into every lane.
    vbroadcastss(ymm6, dword[ALPHA]);
    // Build the constant 0x3f800000 (1.0f) in each dword of xmm3 from an
    // all-ones register: >>23 gives 0x1ff, <<25 gives 0xfe000000,
    // >>2 gives 0x3f800000.
    vpcmpeqb(xmm3, xmm3, xmm3);
    vpsrld(xmm3, xmm3, 0x17);
    vpslld(xmm3, xmm3, 0x19);
    vpsrld(xmm3, xmm3, 0x2);
    // Build the sign-bit mask 0x80000000 in each dword of xmm4 and copy the
    // low 128-bit lane into both lanes of ymm4.
    vpcmpeqb(xmm4, xmm4, xmm4);
    vpslld(xmm4, xmm4, 0x1f);
    vperm2f128(ymm4, ymm4, ymm4, 0x20);
    // Dispatch on alpha: anything that does not compare equal to 1.0f
    // leaves for l2f4.
    // NOTE(review): vucomiss sets ZF for unordered operands, so a NaN alpha
    // appears to fall through into the plain-copy path - confirm intended.
    vucomiss(xmm6, xmm3);
    jne(l2f4, T_NEAR);

    // ===================== alpha == 1.0f : plain copy =====================
    // Skip the 4-wide panel loop when fewer than 4 vectors remain.
    cmp(N, 0x4);
    jl(l16c, T_NEAR);
    align(4);

// Start of one 4-wide panel: A1 walks the current panel, A is moved on to
// the next panel (4 * LDA bytes further).
L(l54);
    mov(A1, A);
    mov(I, LDA);
    imul(I, I, 0x4);
    add(A, I);
    // I = M >> 2, the number of 4-element groups; skip the loop when the
    // count is not positive.
    mov(I, M);
    sar(I, 0x2);
    jle(lcc, T_NEAR);
    align(4);

// 4x4 step: load 4 floats from each of the 4 strided vectors, transpose the
// 4x4 tile in registers and store the 16 floats contiguously to B.
L(l6c);
    vmovups(xmm0, xword[A1]);
    vmovups(xmm1, xword[A1+LDA*1]);
    vmovups(xmm2, xword[A1+LDA*2]);
    vmovups(xmm3, xword[A1+LDA3*1]);
    // Interleave pairs of vectors, then pairs of 64-bit halves, so that
    // xmm0..xmm3 each hold the same-index element of all 4 vectors.
    vunpcklps(xmm4, xmm0, xmm1);
    vunpckhps(xmm5, xmm0, xmm1);
    vunpcklps(xmm1, xmm2, xmm3);
    vunpckhps(xmm3, xmm2, xmm3);
    vunpcklpd(xmm0, xmm4, xmm1);
    vunpckhpd(xmm1, xmm4, xmm1);
    vunpcklpd(xmm2, xmm5, xmm3);
    vunpckhpd(xmm3, xmm5, xmm3);
    vmovups(xword[B-0x80], xmm0);
    vmovups(xword[B-0x70], xmm1);
    vmovups(xword[B-0x60], xmm2);
    vmovups(xword[B-0x50], xmm3);
    // NOTE(review): A2 is written here (and in every section below) but is
    // never read anywhere in this constructor.
    lea(A2, ptr[A1+LDA*4]);
    // Advance the source by 4 floats and the destination by 16 floats.
    sub(A1, -16);
    sub(B, -64);
    dec(I);
    jg(l6c, T_NEAR);
    align(4);

// Remainder of M, bit 1: two elements from each of the 4 vectors.
// `test` clears OF and the masked result has a clear sign bit, so the
// following jle is taken exactly when the tested bit is zero.
L(lcc);
    test(M, 0x2);
    jle(l118, T_NEAR);
    // xmm0 = {v0[0], v0[1], v2[0], v2[1]}, xmm1 = {v1[0], v1[1], v3[0], v3[1]}
    vmovsd(xmm0, qword[A1]);
    vmovsd(xmm1, qword[A1+LDA*1]);
    vmovhps(xmm0, xmm0, qword[A1+LDA*2]);
    vmovhps(xmm1, xmm1, qword[A1+LDA3*1]);
    // Rearrange into {v0[0], v1[0], v2[0], v3[0]} and {v0[1], ..., v3[1]}.
    vunpcklps(xmm4, xmm0, xmm1);
    vunpckhps(xmm1, xmm0, xmm1);
    vunpcklpd(xmm0, xmm4, xmm1);
    vunpckhpd(xmm1, xmm4, xmm1);
    vmovups(xword[B-0x80], xmm0);
    vmovups(xword[B-0x70], xmm1);
    lea(A2, ptr[A1+LDA*4]);
    sub(A1, -8);
    sub(B, -32);
    align(4);

// Remainder of M, bit 0: one element from each of the 4 vectors.
L(l118);
    test(M, 0x1);
    jle(l15c, T_NEAR);
    vmovss(xmm0, dword[A1]);
    vmovss(xmm1, dword[A1+LDA*1]);
    vunpcklps(xmm0, xmm0, xmm1);
    vmovss(xmm2, dword[A1+LDA*2]);
    vmovss(xmm3, dword[A1+LDA3*1]);
    vunpcklps(xmm2, xmm2, xmm3);
    vunpcklpd(xmm0, xmm0, xmm2);
    vmovups(xword[B-0x80], xmm0);
    lea(A2, ptr[A1+LDA*4]);
    sub(A1, -4);
    sub(B, -16);
    align(4);

// Panel done: consume 4 vectors and loop while at least 4 remain.
L(l15c);
    sub(N, 0x4);
    cmp(N, 0x4);
    jge(l54, T_NEAR);
    align(4);

// 2-wide panel (executed at most once): same scheme with two vectors.
L(l16c);
    cmp(N, 0x2);
    jl(l234, T_NEAR);
    mov(A1, A);
    mov(I, LDA);
    imul(I, I, 0x2);
    add(A, I);
    mov(I, M);
    sar(I, 0x2);
    jle(l1cc, T_NEAR);
    align(4);

// 4 elements of each of the two vectors are interleaved pairwise and
// written as four 8-byte pieces (32 contiguous bytes).
L(l18c);
    vmovups(xmm0, xword[A1]);
    vmovups(xmm1, xword[A1+LDA*1]);
    vunpcklps(xmm4, xmm0, xmm1);
    vunpckhps(xmm1, xmm0, xmm1);
    vmovaps(xmm0, xmm4);
    vmovlps(qword[B-0x80], xmm0);
    vmovhps(qword[B-0x78], xmm0);
    vmovlps(qword[B-0x70], xmm1);
    vmovhps(qword[B-0x68], xmm1);
    lea(A2, ptr[A1+LDA*2]);
    sub(A1, -16);
    sub(B, -32);
    dec(I);
    jg(l18c, T_NEAR);
    align(4);

// Remainder of M, bit 1: two elements of each of the two vectors.
L(l1cc);
    test(M, 0x2);
    jle(l200, T_NEAR);
    vmovsd(xmm0, qword[A1]);
    vmovsd(xmm1, qword[A1+LDA*1]);
    vunpcklps(xmm0, xmm0, xmm1);
    vmovlps(qword[B-0x80], xmm0);
    vmovhps(qword[B-0x78], xmm0);
    lea(A2, ptr[A1+LDA*2]);
    sub(A1, -8);
    sub(B, -16);
    align(4);

// Remainder of M, bit 0: one element of each of the two vectors.
L(l200);
    test(M, 0x1);
    jle(l230, T_NEAR);
    vmovss(xmm0, dword[A1]);
    vmovss(xmm1, dword[A1+LDA*1]);
    vunpcklps(xmm0, xmm0, xmm1);
    vmovlps(qword[B-0x80], xmm0);
    lea(A2, ptr[A1+LDA*2]);
    sub(A1, -4);
    sub(B, -8);
    align(4);

L(l230);
    sub(N, 0x2);
    align(4);

// 1-wide panel (executed at most once): a single vector is copied as is.
L(l234);
    cmp(N, 0x1);
    jl(l2ec, T_NEAR);
    mov(A1, A);
    mov(I, LDA);
    imul(I, I, 0x1);
    add(A, I);
    mov(I, M);
    sar(I, 0x2);
    jle(l294, T_NEAR);
    align(4);

// 4 floats per iteration; each lane is moved to lane 0 with vpshufd and
// stored individually, in source order.
L(l254);
    vmovups(xmm0, xword[A1]);
    vpshufd(xmm1, xmm0, 0x55);
    vpshufd(xmm2, xmm0, 0xaa);
    vpshufd(xmm3, xmm0, 0xff);
    vmovss(dword[B-0x80], xmm0);
    vmovss(dword[B-0x7c], xmm1);
    vmovss(dword[B-0x78], xmm2);
    vmovss(dword[B-0x74], xmm3);
    lea(A2, ptr[A1+LDA*1]);
    sub(A1, -16);
    sub(B, -16);
    dec(I);
    jg(l254, T_NEAR);
    align(4);

// Remainder of M, bit 1: two floats.
L(l294);
    test(M, 0x2);
    jle(l2c4, T_NEAR);
    vmovsd(xmm0, qword[A1]);
    vpshufd(xmm1, xmm0, 0x55);
    vmovss(dword[B-0x80], xmm0);
    vmovss(dword[B-0x7c], xmm1);
    lea(A2, ptr[A1+LDA*1]);
    sub(A1, -8);
    sub(B, -8);
    align(4);

// Remainder of M, bit 0: one float.
L(l2c4);
    test(M, 0x1);
    jle(l2e8, T_NEAR);
    vmovss(xmm0, dword[A1]);
    vmovss(dword[B-0x80], xmm0);
    lea(A2, ptr[A1+LDA*1]);
    sub(A1, -4);
    sub(B, -4);
    align(4);

L(l2e8);
    sub(N, 0x1);
    align(4);

// End of the alpha == 1.0f path: go to the common exit.
L(l2ec);
    jmp(l8c4, T_NEAR);
    align(4);

// ================== alpha == -1.0f : copy with sign flip ==================
// Turn the 1.0f constant into -1.0f by XOR-ing in the sign bit and compare
// again; any other alpha continues at l5e8.
L(l2f4);
    vxorps(xmm3, xmm3, xmm4);
    vucomiss(xmm6, xmm3);
    jne(l5e8, T_NEAR);
    // From here on ymm6 holds the sign-bit mask instead of alpha, so that
    // `vxorps(x, xmm6, x)` negates every lane.
    vmovaps(ymm6, ymm4);
    cmp(N, 0x4);
    jl(l444, T_NEAR);
    align(4);

// 4-wide panel setup (same structure as l54).
L(l310);
    mov(A1, A);
    mov(I, LDA);
    imul(I, I, 0x4);
    add(A, I);
    mov(I, M);
    sar(I, 0x2);
    jle(l398, T_NEAR);
    align(4);

// 4x4 transpose step followed by a sign flip of all 16 values.
L(l328);
    vmovups(xmm0, xword[A1]);
    vmovups(xmm1, xword[A1+LDA*1]);
    vmovups(xmm2, xword[A1+LDA*2]);
    vmovups(xmm3, xword[A1+LDA3*1]);
    vunpcklps(xmm4, xmm0, xmm1);
    vunpckhps(xmm5, xmm0, xmm1);
    vunpcklps(xmm1, xmm2, xmm3);
    vunpckhps(xmm3, xmm2, xmm3);
    vunpcklpd(xmm0, xmm4, xmm1);
    vunpckhpd(xmm1, xmm4, xmm1);
    vunpcklpd(xmm2, xmm5, xmm3);
    vunpckhpd(xmm3, xmm5, xmm3);
    vxorps(xmm0, xmm6, xmm0);
    vxorps(xmm1, xmm6, xmm1);
    vxorps(xmm2, xmm6, xmm2);
    vxorps(xmm3, xmm6, xmm3);
    vmovups(xword[B-0x80], xmm0);
    vmovups(xword[B-0x70], xmm1);
    vmovups(xword[B-0x60], xmm2);
    vmovups(xword[B-0x50], xmm3);
    lea(A2, ptr[A1+LDA*4]);
    sub(A1, -16);
    sub(B, -64);
    dec(I);
    jg(l328, T_NEAR);
    align(4);

// Remainder of M, bit 1 (4-wide panel, negated).
L(l398);
    test(M, 0x2);
    jle(l3ec, T_NEAR);
    vmovsd(xmm0, qword[A1]);
    vmovsd(xmm1, qword[A1+LDA*1]);
    vmovhps(xmm0, xmm0, qword[A1+LDA*2]);
    vmovhps(xmm1, xmm1, qword[A1+LDA3*1]);
    vunpcklps(xmm4, xmm0, xmm1);
    vunpckhps(xmm1, xmm0, xmm1);
    vunpcklpd(xmm0, xmm4, xmm1);
    vunpckhpd(xmm1, xmm4, xmm1);
    vxorps(xmm0, xmm6, xmm0);
    vxorps(xmm1, xmm6, xmm1);
    vmovups(xword[B-0x80], xmm0);
    vmovups(xword[B-0x70], xmm1);
    lea(A2, ptr[A1+LDA*4]);
    sub(A1, -8);
    sub(B, -32);
    align(4);

// Remainder of M, bit 0 (4-wide panel, negated).
L(l3ec);
    test(M, 0x1);
    jle(l434, T_NEAR);
    vmovss(xmm0, dword[A1]);
    vmovss(xmm1, dword[A1+LDA*1]);
    vunpcklps(xmm0, xmm0, xmm1);
    vmovss(xmm2, dword[A1+LDA*2]);
    vmovss(xmm3, dword[A1+LDA3*1]);
    vunpcklps(xmm2, xmm2, xmm3);
    vunpcklpd(xmm0, xmm0, xmm2);
    vxorps(xmm0, xmm6, xmm0);
    vmovups(xword[B-0x80], xmm0);
    lea(A2, ptr[A1+LDA*4]);
    sub(A1, -4);
    sub(B, -16);
    align(4);

// Consume 4 vectors and loop while at least 4 remain.
L(l434);
    sub(N, 0x4);
    cmp(N, 0x4);
    jge(l310, T_NEAR);
    align(4);

// 2-wide panel (negated), executed at most once.
L(l444);
    cmp(N, 0x2);
    jl(l51c, T_NEAR);
    mov(A1, A);
    mov(I, LDA);
    imul(I, I, 0x2);
    add(A, I);
    mov(I, M);
    sar(I, 0x2);
    jle(l4ac, T_NEAR);
    align(4);

L(l464);
    vmovups(xmm0, xword[A1]);
    vmovups(xmm1, xword[A1+LDA*1]);
    vunpcklps(xmm4, xmm0, xmm1);
    vunpckhps(xmm1, xmm0, xmm1);
    vmovaps(xmm0, xmm4);
    vxorps(xmm0, xmm6, xmm0);
    vxorps(xmm1, xmm6, xmm1);
    vmovlps(qword[B-0x80], xmm0);
    vmovhps(qword[B-0x78], xmm0);
    vmovlps(qword[B-0x70], xmm1);
    vmovhps(qword[B-0x68], xmm1);
    lea(A2, ptr[A1+LDA*2]);
    sub(A1, -16);
    sub(B, -32);
    dec(I);
    jg(l464, T_NEAR);
    align(4);

// Remainder of M, bit 1 (2-wide panel, negated).
L(l4ac);
    test(M, 0x2);
    jle(l4e4, T_NEAR);
    vmovsd(xmm0, qword[A1]);
    vmovsd(xmm1, qword[A1+LDA*1]);
    vunpcklps(xmm0, xmm0, xmm1);
    vxorps(xmm0, xmm6, xmm0);
    vmovlps(qword[B-0x80], xmm0);
    vmovhps(qword[B-0x78], xmm0);
    lea(A2, ptr[A1+LDA*2]);
    sub(A1, -8);
    sub(B, -16);
    align(4);

// Remainder of M, bit 0 (2-wide panel, negated).
L(l4e4);
    test(M, 0x1);
    jle(l518, T_NEAR);
    vmovss(xmm0, dword[A1]);
    vmovss(xmm1, dword[A1+LDA*1]);
    vunpcklps(xmm0, xmm0, xmm1);
    vxorps(xmm0, xmm6, xmm0);
    vmovlps(qword[B-0x80], xmm0);
    lea(A2, ptr[A1+LDA*2]);
    sub(A1, -4);
    sub(B, -8);
    align(4);

L(l518);
    sub(N, 0x2);
    align(4);

// 1-wide panel (negated), executed at most once.
L(l51c);
    cmp(N, 0x1);
    jl(l5e0, T_NEAR);
    mov(A1, A);
    mov(I, LDA);
    imul(I, I, 0x1);
    add(A, I);
    mov(I, M);
    sar(I, 0x2);
    jle(l580, T_NEAR);
    align(4);

L(l53c);
    vmovups(xmm0, xword[A1]);
    vxorps(xmm0, xmm6, xmm0);
    vpshufd(xmm1, xmm0, 0x55);
    vpshufd(xmm2, xmm0, 0xaa);
    vpshufd(xmm3, xmm0, 0xff);
    vmovss(dword[B-0x80], xmm0);
    vmovss(dword[B-0x7c], xmm1);
    vmovss(dword[B-0x78], xmm2);
    vmovss(dword[B-0x74], xmm3);
    lea(A2, ptr[A1+LDA*1]);
    sub(A1, -16);
    sub(B, -16);
    dec(I);
    jg(l53c, T_NEAR);
    align(4);

// Remainder of M, bit 1 (1-wide panel, negated).
L(l580);
    test(M, 0x2);
    jle(l5b4, T_NEAR);
    vmovsd(xmm0, qword[A1]);
    vxorps(xmm0, xmm6, xmm0);
    vpshufd(xmm1, xmm0, 0x55);
    vmovss(dword[B-0x80], xmm0);
    vmovss(dword[B-0x7c], xmm1);
    lea(A2, ptr[A1+LDA*1]);
    sub(A1, -8);
    sub(B, -8);
    align(4);

// Remainder of M, bit 0 (1-wide panel, negated).
L(l5b4);
    test(M, 0x1);
    jle(l5dc, T_NEAR);
    vmovss(xmm0, dword[A1]);
    vxorps(xmm0, xmm6, xmm0);
    vmovss(dword[B-0x80], xmm0);
    lea(A2, ptr[A1+LDA*1]);
    sub(A1, -4);
    sub(B, -4);
    align(4);

L(l5dc);
    sub(N, 0x1);
    align(4);

// End of the alpha == -1.0f path: go to the common exit.
L(l5e0);
    jmp(l8c4, T_NEAR);
    align(4);

// =================== general alpha : copy with scaling ===================
// ymm6 still holds the broadcast alpha here; every value is multiplied by
// it before being stored.
L(l5e8);
    cmp(N, 0x4);
    jl(l728, T_NEAR);
    align(4);

// 4-wide panel setup (same structure as l54).
L(l5f4);
    mov(A1, A);
    mov(I, LDA);
    imul(I, I, 0x4);
    add(A, I);
    mov(I, M);
    sar(I, 0x2);
    jle(l67c, T_NEAR);
    align(4);

// 4x4 transpose step followed by scaling of all 16 values.
L(l60c);
    vmovups(xmm0, xword[A1]);
    vmovups(xmm1, xword[A1+LDA*1]);
    vmovups(xmm2, xword[A1+LDA*2]);
    vmovups(xmm3, xword[A1+LDA3*1]);
    vunpcklps(xmm4, xmm0, xmm1);
    vunpckhps(xmm5, xmm0, xmm1);
    vunpcklps(xmm1, xmm2, xmm3);
    vunpckhps(xmm3, xmm2, xmm3);
    vunpcklpd(xmm0, xmm4, xmm1);
    vunpckhpd(xmm1, xmm4, xmm1);
    vunpcklpd(xmm2, xmm5, xmm3);
    vunpckhpd(xmm3, xmm5, xmm3);
    vmulps(xmm0, xmm6, xmm0);
    vmulps(xmm1, xmm6, xmm1);
    vmulps(xmm2, xmm6, xmm2);
    vmulps(xmm3, xmm6, xmm3);
    vmovups(xword[B-0x80], xmm0);
    vmovups(xword[B-0x70], xmm1);
    vmovups(xword[B-0x60], xmm2);
    vmovups(xword[B-0x50], xmm3);
    lea(A2, ptr[A1+LDA*4]);
    sub(A1, -16);
    sub(B, -64);
    dec(I);
    jg(l60c, T_NEAR);
    align(4);

// Remainder of M, bit 1 (4-wide panel, scaled).
L(l67c);
    test(M, 0x2);
    jle(l6d0, T_NEAR);
    vmovsd(xmm0, qword[A1]);
    vmovsd(xmm1, qword[A1+LDA*1]);
    vmovhps(xmm0, xmm0, qword[A1+LDA*2]);
    vmovhps(xmm1, xmm1, qword[A1+LDA3*1]);
    vunpcklps(xmm4, xmm0, xmm1);
    vunpckhps(xmm1, xmm0, xmm1);
    vunpcklpd(xmm0, xmm4, xmm1);
    vunpckhpd(xmm1, xmm4, xmm1);
    vmulps(xmm0, xmm6, xmm0);
    vmulps(xmm1, xmm6, xmm1);
    vmovups(xword[B-0x80], xmm0);
    vmovups(xword[B-0x70], xmm1);
    lea(A2, ptr[A1+LDA*4]);
    sub(A1, -8);
    sub(B, -32);
    align(4);

// Remainder of M, bit 0 (4-wide panel, scaled).
L(l6d0);
    test(M, 0x1);
    jle(l718, T_NEAR);
    vmovss(xmm0, dword[A1]);
    vmovss(xmm1, dword[A1+LDA*1]);
    vunpcklps(xmm0, xmm0, xmm1);
    vmovss(xmm2, dword[A1+LDA*2]);
    vmovss(xmm3, dword[A1+LDA3*1]);
    vunpcklps(xmm2, xmm2, xmm3);
    vunpcklpd(xmm0, xmm0, xmm2);
    vmulps(xmm0, xmm6, xmm0);
    vmovups(xword[B-0x80], xmm0);
    lea(A2, ptr[A1+LDA*4]);
    sub(A1, -4);
    sub(B, -16);
    align(4);

// Consume 4 vectors and loop while at least 4 remain.
L(l718);
    sub(N, 0x4);
    cmp(N, 0x4);
    jge(l5f4, T_NEAR);
    align(4);

// 2-wide panel (scaled), executed at most once.
L(l728);
    cmp(N, 0x2);
    jl(l800, T_NEAR);
    mov(A1, A);
    mov(I, LDA);
    imul(I, I, 0x2);
    add(A, I);
    mov(I, M);
    sar(I, 0x2);
    jle(l790, T_NEAR);
    align(4);

L(l748);
    vmovups(xmm0, xword[A1]);
    vmovups(xmm1, xword[A1+LDA*1]);
    vunpcklps(xmm4, xmm0, xmm1);
    vunpckhps(xmm1, xmm0, xmm1);
    vmovaps(xmm0, xmm4);
    vmulps(xmm0, xmm6, xmm0);
    vmulps(xmm1, xmm6, xmm1);
    vmovlps(qword[B-0x80], xmm0);
    vmovhps(qword[B-0x78], xmm0);
    vmovlps(qword[B-0x70], xmm1);
    vmovhps(qword[B-0x68], xmm1);
    lea(A2, ptr[A1+LDA*2]);
    sub(A1, -16);
    sub(B, -32);
    dec(I);
    jg(l748, T_NEAR);
    align(4);

// Remainder of M, bit 1 (2-wide panel, scaled).
L(l790);
    test(M, 0x2);
    jle(l7c8, T_NEAR);
    vmovsd(xmm0, qword[A1]);
    vmovsd(xmm1, qword[A1+LDA*1]);
    vunpcklps(xmm0, xmm0, xmm1);
    vmulps(xmm0, xmm6, xmm0);
    vmovlps(qword[B-0x80], xmm0);
    vmovhps(qword[B-0x78], xmm0);
    lea(A2, ptr[A1+LDA*2]);
    sub(A1, -8);
    sub(B, -16);
    align(4);

// Remainder of M, bit 0 (2-wide panel, scaled).
L(l7c8);
    test(M, 0x1);
    jle(l7fc, T_NEAR);
    vmovss(xmm0, dword[A1]);
    vmovss(xmm1, dword[A1+LDA*1]);
    vunpcklps(xmm0, xmm0, xmm1);
    vmulps(xmm0, xmm6, xmm0);
    vmovlps(qword[B-0x80], xmm0);
    lea(A2, ptr[A1+LDA*2]);
    sub(A1, -4);
    sub(B, -8);
    align(4);

L(l7fc);
    sub(N, 0x2);
    align(4);

// 1-wide panel (scaled), executed at most once; when no vector is left the
// code jumps straight to the common exit.
L(l800);
    cmp(N, 0x1);
    jl(l8c4, T_NEAR);
    mov(A1, A);
    mov(I, LDA);
    imul(I, I, 0x1);
    add(A, I);
    mov(I, M);
    sar(I, 0x2);
    jle(l864, T_NEAR);
    align(4);

L(l820);
    vmovups(xmm0, xword[A1]);
    vmulps(xmm0, xmm6, xmm0);
    vpshufd(xmm1, xmm0, 0x55);
    vpshufd(xmm2, xmm0, 0xaa);
    vpshufd(xmm3, xmm0, 0xff);
    vmovss(dword[B-0x80], xmm0);
    vmovss(dword[B-0x7c], xmm1);
    vmovss(dword[B-0x78], xmm2);
    vmovss(dword[B-0x74], xmm3);
    lea(A2, ptr[A1+LDA*1]);
    sub(A1, -16);
    sub(B, -16);
    dec(I);
    jg(l820, T_NEAR);
    align(4);

// Remainder of M, bit 1 (1-wide panel, scaled).
L(l864);
    test(M, 0x2);
    jle(l898, T_NEAR);
    vmovsd(xmm0, qword[A1]);
    vmulps(xmm0, xmm6, xmm0);
    vpshufd(xmm1, xmm0, 0x55);
    vmovss(dword[B-0x80], xmm0);
    vmovss(dword[B-0x7c], xmm1);
    lea(A2, ptr[A1+LDA*1]);
    sub(A1, -8);
    sub(B, -8);
    align(4);

// Remainder of M, bit 0 (1-wide panel, scaled).
L(l898);
    test(M, 0x1);
    jle(l8c0, T_NEAR);
    vmovss(xmm0, dword[A1]);
    vmulps(xmm0, xmm6, xmm0);
    vmovss(dword[B-0x80], xmm0);
    lea(A2, ptr[A1+LDA*1]);
    sub(A1, -4);
    sub(B, -4);
    align(4);

L(l8c0);
    sub(N, 0x1);
    align(4);

// Common exit shared by all three alpha paths.
L(l8c4);

    postamble();
}
outLocalLabel();

// Drop the constructor-local register aliases.
#undef M
#undef N
#undef A
#undef LDA
#undef ALPHA
#undef B
#undef I
#undef A1
#undef A2
#undef LDA3
#ifdef _WIN32
#undef ARG_ALPHA
#undef ARG_B
#endif
}

}
}
}
